Word Count = 2236
This report presents an exploratory data analysis (EDA) and predictive modeling for:
Medical Insurance Charges Prediction using multiple regression techniques, LASSO regression, Boosting, Random Forest and Bayesian Additive Regression Trees (BART)
Stroke Prediction using Step-wise logistic regression, LASSO, Linear Discriminant Analysis(LDA), Quadratic Discriminant Analysis (QDA) and Random Forest.
# Packages used throughout the analysis. Any that are missing are installed
# once, then every package is attached for the session.
pkgs <- c(
  "tidyverse",    # data manipulation
  "car",          # vif(); multicollinearity
  "tableone",     # descriptive statistics
  "psych",        # descriptive statistics
  "caret",        # data splitting and model training
  "dplyr",        # data manipulation
  "GGally",       # pairwise scatter plot
  "boot",         # bootstrapping and cv
  "corrplot",     # correlation matrix
  "faraway",      # model diagnostics
  "tidyr",        # data manipulation
  "leaps",        # variable selection, regsubsets
  "MASS",         # lda & qda
  "mgcv",         # gam() / nonlinear models
  "splines",      # splines for nonlinear models
  "glmnet",       # L1 & L2, cv.glmnet() to automate lambda tuning
  "tree",         # decision tree
  "gmodels",      # cross tabulation
  "randomForest", # random forest models
  "gbm",          # boosting
  "BART",         # Bayesian Additive Regression Trees
  "Matrix",       # sparse matrix for LASSO coefficients
  "mice",         # imputation
  "pROC"          # ROC curves
)
# Install any package that cannot be attached (first run only)
for (p in pkgs) {
  if (!require(p, character.only = TRUE)) {
    install.packages(p, dependencies = TRUE)
  }
}
# Attach every package (covers any freshly installed above)
for (p in pkgs) {
  library(p, character.only = TRUE)
}
# Read the medical insurance data (absolute path; adjust to your machine)
df <- read.csv("/Users/20lau01/Documents/GitHub/Data Science Modelling/Assessment/medical_insurance.csv")
# Understand the data: dimensions, summaries, structure and missingness
dim(df)
summary(df)
glimpse(df)
View(df)
sum(is.na(df)) # total count of missing values across the whole data frame
# Factorise categorical variables
cols <- c("children", "sex", "smoker", "region")
for (cl in cols) {
  df[[cl]] <- as.factor(df[[cl]])
}
# Ensure the continuous variables are stored as numeric
for (cl in c("charges", "age", "bmi")) {
  df[[cl]] <- as.numeric(df[[cl]])
}
##
## level Overall
## n 1,338
## charges (mean (SD)) 13,270.42 (12,110.01)
## age (mean (SD)) 39.21 (14.05)
## bmi (mean (SD)) 30.66 (6.10)
## sex (%) female 662 (49.5)
## male 676 (50.5)
## smoker (%) no 1064 (79.5)
## yes 274 (20.5)
## children (%) 0 574 (42.9)
## 1 324 (24.2)
## 2 240 (17.9)
## 3 157 (11.7)
## 4 25 ( 1.9)
## 5 18 ( 1.3)
## region (%) northeast 324 (24.2)
## northwest 325 (24.3)
## southeast 364 (27.2)
## southwest 325 (24.3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1122 4740 9382 13270 16640 63770
# Explore transformation for response variables charges
par(mfrow=c(1,1)) # reset plotting layout to a single panel
symbox(~charges, data=df, col="blue", pch=19) # Box-Cox-style boxplots across powers: Log transformation will be done for charges (y)
hist(log(df$charges)) # charges more normally distributed after log transformation
summary(log(df$charges)) # mean = 9.10, median = 9.15
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.023 8.464 9.147 9.099 9.720 11.063
# Copy the data, add the log-transformed response, and drop the raw charges;
# this data frame (dff) is used for the linear models.
dff <- df
dff$log_charges <- log(df$charges)
dff <- subset(dff, select = -charges)
# Numeric variables only (drop the categorical columns, keeping column order)
numeric_vars <- df[, !(names(df) %in% c("smoker", "region", "sex", "children"))]
# Correlation matrix (numeric variables only)
cor_matrix <- cor(numeric_vars)
cor_matrix
## charges age bmi
## charges 1.0000000 0.2990082 0.1983410
## age 0.2990082 1.0000000 0.1092719
## bmi 0.1983410 0.1092719 1.0000000
# Pairwise relationships across all variables, then numeric-only
GGally::ggpairs(df)
GGally::ggpairs(numeric_vars) # numeric variables only
# Examine numeric variable graphics more clearly
scatterplot(df$bmi, df$charges) # past a bmi 30, the charges increase significantly
scatterplot(df$age, df$charges) # no abrupt change in charges for age
# Statistical testing (Significant results)
# T test for numerical x binary; H0 = the means of the two groups are equal
# Assumes normality so dff and log charges will be used
t.test(dff$log_charges ~ dff$smoker) # Reject H0: smokers and non-smokers differ in mean log charges
# Chi square test; H0: The variables are independent
# smoker x sex
with(df, CrossTable(sex, smoker, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS"))) # Reject H0, Statistically significant, association
# ANOVA: compare a numeric variable across >2 groups
# bmi x region
aov4 <- aov(bmi ~ factor(region), data = dff)
summary(aov4)
# significant difference in bmi between the regions
# charges x children
aov5 <- aov(log_charges ~ factor(children), data = dff)
summary(aov5)
# significant difference in charges between the number of children
Exploratory analysis of the Medical Insurance data set provides important insights into the charges patterns and factors influencing them.
The data set is well-structured with no missing values.
The Charges data includes individual characteristics such as
Age, BMI and Sex, as well as
lifestyle indicators such as whether the individual is a
Smoker or not, the geographical Region of the
individual, and the number of Children they have. These 6
explanatory variables from 1338 individuals may or may not help predict
an individual's medical insurance charges.
Outliers in Charges suggest some individuals have
exceptionally high or low charges, which may be due to specific medical
conditions or treatments.
Charges was log transformed to
create a more normal distribution, which may improve model performance
for linear models.
In this data set the distribution of age is centered
around ages 39-40 (mean = 39.2, median = 39). 69 individuals of the 1338
are aged 18, making it the highest concentration category.
There are fewer older individuals, and there are 1% more males than females in the data. 79.5% of individuals are non-smokers. This imbalance may affect feature importance, the precision of estimates, and the stability of models.
Most people in the data do not have any
children.
There are more people living in the southeast region than any other region, which may affect statistical testing and predictive modeling.
Examining relationships between Charges and other
variables show:
BMI has a low positive Pearson
correlation with Charges (r ≈ 0.20), so BMI alone may not
strongly influence medical insurance charges. However, there are assumption
violations and further exploration is required, as there could be
non-linear relationships. Visual exploration suggests medical charges are
higher for some individuals who have a BMI higher than 30. Age showed a moderate positive Pearson
correlation (r ≈ 0.30) with Charges, consistent with the
scatterplot findings; older individuals in the data have higher medical
insurance costs. The difference between smokers and non-smokers is
statistically significant, with smokers having a higher
average of medical Charges. Sex does not appear to have a notable impact on
Charges; a two-sample t-test showed the mean difference in
Charges between males and females was not
statistically significant. ANOVA indicated a significant association between the number of Children a
person has and their medical Charges. Charges vary slightly between different
Regions. Grouped box plots indicate that those living in the
Southeast region tend to have slightly higher
Charges, but ANOVA showed the association between
Region and Charges is not
statistically significant. Several aspects of the data require attention before modeling:
Outliers in Charges: Some individuals have extremely high or low charges. Transforming the data may help reduce their impact, but it can distort the relationship between charges and predictors. The original variable will be used when fitting non-linear models.
Non-Linear Relationships: Relationship between
Charges and BMI may not be linear as
scatterplots suggest. Exploring polynomials and non-linear models like
Random Forest may be beneficial.
Multicollinearity among predictors:
Age with other predictors should be assessed, as it may
affect model performance. Interactions like smoker:sex and
bmi:region could also improve predictive performance,
though their relevance requires cautious validation given earlier
assumption violations in preliminary hypothesis testing.
Medical Insurance Charges of individuals are influenced
by personal and lifestyle factors. The data set
provides valuable predictors. Improvements can be made
by addressing outliers and exploring non-linear
relationships and interaction terms. This will
help build models and render insights that achieve better predictive
accuracy.
# The validation set approach benefits from computational efficiency, but falls short in terms of wasting the data. 10-fold cross-validation approach is more robust, as it uses all data points for training and testing, but is computationally more expensive. Validation set approach is used for more exploratory purposes. Moreover, for more stable performance estimates of statistical learning methods and for final model selection the 10 fold cross-validation approach is used. 10 fold is utilized as it strikes a practical balance in bias-variance trade-off and is a common choice in practice.
set.seed(123) # set seed for reproducibility
# 80/20 train/test split, stratified on the (log-transformed) response
trainIndexdff <- createDataPartition(dff$log_charges, p=0.8, list=FALSE, times=1)
train_datadff <- dff[trainIndexdff, ] # 80% training set
test_datadff <- dff[-trainIndexdff, ] # held-out 20% test set
##
## Call:
## lm(formula = log_charges ~ ., data = train_datadff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.06118 -0.19719 -0.04629 0.07727 2.12656
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0469214 0.0812944 86.684 < 2e-16 ***
## age 0.0339824 0.0009778 34.754 < 2e-16 ***
## sexmale -0.0904965 0.0272236 -3.324 0.000917 ***
## bmi 0.0134105 0.0023427 5.724 1.35e-08 ***
## children1 0.1594599 0.0342543 4.655 3.65e-06 ***
## children2 0.2469590 0.0384403 6.424 2.00e-10 ***
## children3 0.2488788 0.0450825 5.521 4.25e-08 ***
## children4 0.4580122 0.1014335 4.515 7.03e-06 ***
## children5 0.3698519 0.1252153 2.954 0.003209 **
## smokeryes 1.5748528 0.0336888 46.747 < 2e-16 ***
## regionnorthwest -0.0840566 0.0393763 -2.135 0.033015 *
## regionsoutheast -0.1602774 0.0387394 -4.137 3.79e-05 ***
## regionsouthwest -0.1256877 0.0396387 -3.171 0.001564 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4435 on 1059 degrees of freedom
## Multiple R-squared: 0.7707, Adjusted R-squared: 0.7681
## F-statistic: 296.7 on 12 and 1059 DF, p-value: < 2.2e-16
## RMSE of the full model: 8314.693
# Baseline model for comparison: multiple linear regression on the
# untransformed charges, evaluated with 10-fold cross-validation.
set.seed(123)
ctrl <- trainControl(method = "cv", number = 10) # 10-fold CV control object
mlr_model <- train(
  charges ~ .,
  data      = df,
  method    = "lm",
  trControl = ctrl
)
summary(mlr_model) # coefficient table of the final fitted lm
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11689.4 -2902.6 -943.7 1492.2 30042.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11927.17 993.66 -12.003 < 2e-16 ***
## age 257.19 11.91 21.587 < 2e-16 ***
## bmi 336.91 28.61 11.775 < 2e-16 ***
## children1 390.98 421.35 0.928 0.353619
## children2 1635.78 466.67 3.505 0.000471 ***
## children3 964.34 548.10 1.759 0.078735 .
## children4 2947.37 1239.16 2.379 0.017524 *
## children5 1116.04 1456.02 0.767 0.443514
## sexmale -128.16 332.83 -0.385 0.700254
## smokeryes 23836.41 414.14 57.557 < 2e-16 ***
## regionnorthwest -380.04 476.56 -0.797 0.425318
## regionsoutheast -1033.14 479.14 -2.156 0.031245 *
## regionsouthwest -952.89 478.15 -1.993 0.046483 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6059 on 1325 degrees of freedom
## Multiple R-squared: 0.7519, Adjusted R-squared: 0.7497
## F-statistic: 334.7 on 12 and 1325 DF, p-value: < 2.2e-16
mlr_rmse_cv <- mlr_model$results$RMSE # cross-validated RMSE of the baseline model
cat("RMSE of the model using 10-fold cross-validation:", mlr_rmse_cv, "\n")
## RMSE of the model using 10-fold cross-validation: 6083.203
The model explains 75% of the variance in the data, with
smoker being the most significant
predictor.
The model indicates: smokers have charges that are $23,836
higher than non-smokers, ceteris paribus; each additional year of age is associated with a $257
increase in charges on average, ceteris paribus; and each unit increase in bmi increases
charges by $337 on average, ceteris paribus.
set.seed(123)
# 80/20 split of the untransformed data (stratified on charges),
# used later by the tree-based models
trainIndex <- createDataPartition(df$charges, p=0.8, list=FALSE, times=1)
train_data <- df[trainIndex, ]
test_data <- df[-trainIndex, ]
# Fit a model with interaction terms on the log response (training data)
interaction_model <- lm(log_charges ~ sex * smoker + age * bmi + age * smoker + bmi * smoker + region * smoker + bmi * region, data = train_datadff)
summary(interaction_model)
##
## Call:
## lm(formula = log_charges ~ sex * smoker + age * bmi + age * smoker +
## bmi * smoker + region * smoker + bmi * region, data = train_datadff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.81074 -0.18724 -0.06788 0.07532 2.35683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.083e+00 2.153e-01 32.894 < 2e-16 ***
## sexmale -1.164e-01 2.782e-02 -4.185 3.09e-05 ***
## smokeryes 1.149e+00 1.834e-01 6.265 5.43e-10 ***
## age 4.443e-02 4.607e-03 9.643 < 2e-16 ***
## bmi 7.676e-03 7.002e-03 1.096 0.2732
## regionnorthwest -2.232e-01 1.885e-01 -1.184 0.2368
## regionsoutheast 1.480e-01 1.779e-01 0.832 0.4056
## regionsouthwest -1.851e-01 1.924e-01 -0.962 0.3362
## sexmale:smokeryes 1.164e-01 6.243e-02 1.864 0.0626 .
## age:bmi -9.966e-05 1.456e-04 -0.684 0.4939
## smokeryes:age -3.259e-02 2.206e-03 -14.777 < 2e-16 ***
## smokeryes:bmi 4.979e-02 5.275e-03 9.437 < 2e-16 ***
## smokeryes:regionnorthwest 7.941e-02 8.998e-02 0.883 0.3777
## smokeryes:regionsoutheast 8.579e-02 8.536e-02 1.005 0.3151
## smokeryes:regionsouthwest 1.695e-01 9.205e-02 1.841 0.0659 .
## bmi:regionnorthwest 4.839e-03 6.299e-03 0.768 0.4426
## bmi:regionsoutheast -9.360e-03 5.554e-03 -1.685 0.0923 .
## bmi:regionsouthwest 5.363e-04 6.244e-03 0.086 0.9316
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4028 on 1054 degrees of freedom
## Multiple R-squared: 0.8118, Adjusted R-squared: 0.8088
## F-statistic: 267.4 on 17 and 1054 DF, p-value: < 2.2e-16
# Significant interactions from the summary above:
# smoker:age
# smoker:bmi
# bmi:regionsoutheast is only marginal (p ~ 0.09) and the other regions are not
# significant; EDA also suggests that we should ignore this interaction
set.seed(123)
ctrl <- trainControl(method = "cv", number = 10)
# Fit a multiple linear regression model with interaction terms
mlr_modelIT <- train(charges ~ .+ smoker:age + smoker:bmi,
data = df,
method = "lm",
trControl = ctrl)
# Cross-validated RMSE of the interaction model
mlr_rmse_cvIT <- mlr_modelIT$results$RMSE
cat("RMSE of the model with interaction terms using 10-fold cross-validation:", mlr_rmse_cvIT, "\n")
## RMSE of the model with interaction terms using 10-fold cross-validation: 4853.675
## Best number of variables according to adjusted R-squared: 11
## Best number of variables according to Cp: 10
## Best number of variables according to BIC: 3
# Coefficients of the best model according to BIC (3 predictors)
coef(regfit.full, 3) # regfit.full presumably fitted by regsubsets() earlier -- TODO confirm
## (Intercept) age smokeryes bmi:smokeryes
## -1934.3416 260.9886 -21353.7046 1477.9478
## RMSE of the best subset model using BIC: 5256.636
## Best model using 1-SE rule: 2
## RMSE of the best subset model using 1-SE rule: 5239.712
## (Intercept) age bmi:smokeryes
## -2252.6938 263.8284 818.4120
# Standardise the numeric columns (z-scores: subtract the mean, divide by SD).
# means and sds are kept as globals so coefficients can be unstandardised later.
set.seed(1)
numeric_columns <- names(df)[vapply(df, is.numeric, logical(1))]
means <- colMeans(df[, numeric_columns])
sds <- apply(df[, numeric_columns], 2, sd)
# Copy of the data with the numeric columns replaced by their scaled versions
df_scaled <- df
df_scaled[, numeric_columns] <- scale(df[, numeric_columns], center = means, scale = sds)
##
## Call: cv.glmnet(x = X, y = y, nfolds = 10, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.00204 65 0.1625 0.009744 13
## 1se 0.04829 31 0.1717 0.010313 3
## Optimal lambda using 10-fold cross-validation: 0.002042173
## 15 x 1 sparse Matrix of class "dgCMatrix"
## unstandardized_coef
## (Intercept) -1.268029852
## age 0.021713234
## bmi 0.001434399
## children1 0.016241252
## children2 0.115133355
## children3 0.071736829
## children4 0.262040891
## children5 0.124442091
## sexmale -0.036249941
## smokeryes 1.962638567
## regionnorthwest -0.034215475
## regionsoutheast -0.082548719
## regionsouthwest -0.085481037
## age:smokeryes .
## bmi:smokeryes 0.722569448
## RMSE of the LASSO model with interaction terms using 10-fold cross-validation: 4816.436
##
## Regression tree:
## tree(formula = charges ~ ., data = train_data)
## Variables actually used in tree construction:
## [1] "smoker" "age" "bmi"
## Number of terminal nodes: 5
## Residual mean deviance: 22920000 = 2.446e+10 / 1067
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8907.0 -2920.0 -958.4 0.0 1013.0 24390.0
## RMSE of the Decision Tree model: 5316.753
## RMSE of the Pruned Decision Tree model: 6313.892
## Selected cp using 1-SE rule: 0.06
## Final RMSE of the Decision Tree model using 10-fold cross-validation: 5082.757
Boosting is used as another method for variable selection as unlike LASSO and best subset selection, it can model non-linear relationships and capture interactions well.
## RMSE of the Boosting model: 4993.59
## RMSE of the Boosting model using 10-fold cross-validation: 4480.496
## RMSE of the Random Forest model: 4665.599
## RMSE of the Random Forest model with interaction terms: 4596.214
The importance rankings differ slightly, as the interaction terms are treated as more important. Age is treated as more important than smoker when the interactions are considered explicitly.
set.seed(123) # reproducible fold assignment
k <- 10 # number of cross-validation folds
folds <- sample(1:k, nrow(df), replace = TRUE) # randomly assign each row to a fold
rmse_list <- numeric(k) # preallocate per-fold RMSE storage
# Manual k-fold cross-validation for BART; x and y are assumed to be the
# predictor matrix and response built earlier -- TODO confirm they exist above
for (i in 1:k) {
train_idx <- which(folds != i)
test_idx <- which(folds == i)
bart_cvmodel <- gbart(x[train_idx, ], y[train_idx],
x.test = x[test_idx, ],
ntree = 50, k = 2)
rmse_list[i] <- sqrt(mean((y[test_idx] - bart_cvmodel$yhat.test.mean)^2)) # fold RMSE on held-out rows
}
## 10-Fold CV RMSE (gbart): 4446.968
## Best Performing Model (RMSE): BART
# Bar chart comparing the cross-validated RMSE of every model;
# the best (lowest-RMSE) model is highlighted in orange.
ggplot(model_performance, aes(x = reorder(Model, -RMSE), y = RMSE, fill = (Model == best_model))) +
  geom_bar(stat = "identity", alpha = 0.7) +
  scale_fill_manual(values = c("gray", "darkorange")) +
  geom_text(aes(label = round(RMSE, 2)), vjust = -0.5, size = 4, fontface = "bold") +
  labs(
    title = "Model Performance Comparison (RMSE)",
    # Fixed axis label: RMSE is the ROOT mean squared error
    # (previously mislabelled "Mean Squared Error (RMSE)")
    y = "Root Mean Squared Error (RMSE)",
    x = "Model"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
After comparing the regression models using Root Mean Squared Error (RMSE), we can make several observations about the predictive performance of each model for medical insurance charges.
Smoker is the most important in predicting
charges when examining main effects alone,
followed by age and bmi.
When interactions are considered, the models suggest
age and smoker:bmi may be more important than
smoker alone.
The best-performing model, with the lowest RMSE is highlighted in orange in the final plot.
Tree based models (e.g.Boosting, Random Forest and BART) outperformed linear models, though linear models with interaction terms surpassed decision trees and had a lower RMSE than basic linear models. This suggests non-linear relationships in the data, making simple linear model inadequate.
BART had the lowest RMSE, this suggests that there may be more complex relationships in the data such as higher order interactions e.g. between smoker, age and BMI and other variables.
Multivariate Linear Regression: Provides a simple baseline model, but simple least squares is too flexible, over fits the data and interactions or non-linear trends may not be captured.
Regression with Interaction Terms: Helps capture dependencies between predictors, but increased model complexity, does not always improve accuracy since simple least squares still over fits the data.
Polynomial Regression: Did not effectively capture non-linear relationships and suggested that relationships are not smooth global curves.
Best Subset Regression (BREG) with interaction terms:
selected age and smoker:bmi as the top predictors.
The impact of smoking on charges varies significantly with BMI (and vice versa).
The joint effect explains more variance than either alone. LASSO: provided marginal improvement over BREG.
The weak lambda_min penalty resulted in a model too weak to
enforce sparsity (many variables retained) and high variance (potential
noise). The least important variable was
age:smoker.
Random Forest: Achieved strong predictive accuracy, ranking second only to BART. It outperforms both decision trees and boosting methods.
Tree Based Methods had the lowest MSE:
→ Medical Charges depend on complex non-linear, non-additive
relationships.
Next Steps:
- It would have been reasonable to explore step
functions e.g. for bmi over 30 as there may be a
threshold effect. For instance, smokers who have a bmi over
30 likely have higher medical charges.
- Improve feature engineering, remove collinear
variables, and better model smoker*bmi.
- Explore unsupervised methods e.g. cluster analysis to
better understand bmi:smoker.
- Try advanced models like XGBoost and Hybrid Models to
trade interpretability for improved accuracy.
This analysis provides a strong foundation for medical charges forecasting, but further refinements could lead to even better predictive performance.
# Appendix: remaining pairwise hypothesis tests between predictors
# T-tests (numeric outcome vs binary group)
t.test(dff$log_charges ~ dff$sex)
t.test(dff$age ~ dff$smoker)
t.test(dff$age ~ dff$sex)
# Chi-squared tests (independence of two categorical variables)
# Region x smoker
with(df, CrossTable(region, smoker, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Close to 0.05, but not statistically significant
# Region x children
with(df, CrossTable(region, children, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Sex x region
with(df, CrossTable(sex, region, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, Not statistically significant
# Children x sex
with(df, CrossTable(sex, children, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, Not statistically significant
# Children x smoker
with(df, CrossTable(smoker, children, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, but note there are less than 5 expected counts in some cells
# (chi-square approximation is unreliable there)
# ANOVA (numeric outcome vs multi-level factor)
# age x children
aov1 <- aov(age ~ factor(children), data = dff)
summary(aov1)
par(mfrow=c(2,2))
plot(aov1) # check residuals to decide on weight of resulting H0 conclusion
# no significant difference in age between the number of children
# age x region
aov2 <- aov(age ~ factor(region), data = dff)
summary(aov2)
par(mfrow=c(2,2))
plot(aov2)
# no significant difference in age between the regions
# bmi x children
aov3 <- aov(bmi ~ factor(children), data = dff)
summary(aov3)
par(mfrow=c(2,2))
plot(aov3)
# no significant difference in bmi between the number of children
# charges x region
aov6 <- aov(log_charges ~ factor(region), data = dff)
summary(aov6)
par(mfrow=c(2,2))
plot(aov6)
# no significant difference in charges between the different regions
# Explore polynomial degrees (1-4) for the numeric predictors
fit1 <- lm(charges ~ age, data = df)
fit2 <- lm(charges ~ poly(age, 2), data = df)
fit3 <- lm(charges ~ poly(age, 3), data = df)
fit4 <- lm(charges ~ poly(age, 4), data = df)
AIC(fit1, fit2, fit3, fit4) # compare fit penalised for model complexity
anova(fit1, fit2, fit3, fit4) # sequential F-tests between nested models
# More complex models haven't improved the fit much; the linear model is simpler
fit1 <- lm(charges ~ bmi, data = df)
fit2 <- lm(charges ~ poly(bmi, 2), data = df)
fit3 <- lm(charges ~ poly(bmi, 3), data = df)
fit4 <- lm(charges ~ poly(bmi, 4), data = df)
AIC(fit1, fit2, fit3, fit4)
anova(fit1, fit2, fit3, fit4)
# Polynomial terms for bmi do not seem beneficial either
# Read the stroke prediction data (absolute path; adjust to your machine)
df2 <- read.csv("/Users/20lau01/Documents/GitHub/Data Science Modelling/Assessment/stroke_prediction.csv")
# Explore dataset: dimensions, summaries, structure
dim(df2)
summary(df2)
glimpse(df2)
View(df2)
## [1] "gender" "age" "hypertension" "heartdisease"
## [5] "evermarried" "worktype" "residencetype" "avgglucoselevel"
## [9] "bmi" "smokingstatus" "stroke"
# Understand missingness: each variable encodes "missing" differently
# Age: 0.08 appears to be a data-entry artefact treated as missing -- TODO confirm
min(df2$age)
sum(df2$age == 0.08)
sum(is.na(df2$age)) # Check for missing values
# BMI: read as character because missing values are the literal string "N/A"
sum(df2$bmi== "N/A")
sum(is.na(df2$bmi))
class(df2$bmi)
# Smoking: missing values are coded as "Unknown"
sum(df2$smokingstatus == "Unknown")
sum(is.na(df2$smokingstatus))
# Check total missing values before recoding
sum(is.na(df2))
# Recode all sentinel values to NA so they are handled consistently
df2$age[df2$age == 0.08] <- NA # Replace 0.08 with NA (0.08 parses to the identical double, so == matches here)
df2$bmi[df2$bmi == "N/A"] <- NA # Replace the string "N/A" with NA
df2$smokingstatus[df2$smokingstatus == "Unknown"] <- NA
# Factorise categorical variables
cols <- c("stroke", "gender", "evermarried", "hypertension", "heartdisease", "worktype", "residencetype", "smokingstatus")
df2[cols] <- lapply(df2[cols], as.factor)
# Recode binary 0/1 variables to No/Yes labels
levels(df2$heartdisease) <- c("No", "Yes")
levels(df2$stroke) <- c("No", "Yes")
# Reorder levels for smoking status (ordered: never < formerly < smokes)
df2$smokingstatus <- factor(df2$smokingstatus, levels = c("never smoked", "formerly smoked", "smokes"), ordered = TRUE)
# Remove the one "Other" observation from gender for simplicity in modelling
df2 <- df2[df2$gender != "Other",]
df2$gender <- droplevels(df2$gender)
df2$gender <- relevel(df2$gender, ref = "Male") # Male becomes the reference level
## stroke
## No Yes
## 4860 249
## evermarried
## No Yes
## 1756 3353
## hypertension
## 0 1
## 4611 498
## heartdisease
## No Yes
## 4833 276
## residencetype
## Rural Urban
## 2513 2596
## gender
## Male Female
## 2115 2994
## worktype
## children Govt_job Never_worked Private Self-employed
## 687 657 22 2924 819
## smokingstatus
## never smoked formerly smoked smokes
## 1892 884 789
##
## level Overall
## n 5,109
## stroke (%) No 4860 (95.1)
## Yes 249 ( 4.9)
## bmi (mean (SD)) 28.89 (7.85)
## age (mean (SD)) 43.23 (22.61)
## avgglucoselevel (mean (SD)) 106.14 (45.29)
## evermarried (%) No 1756 (34.4)
## Yes 3353 (65.6)
## hypertension (%) 0 4611 (90.3)
## 1 498 ( 9.7)
## heartdisease (%) No 4833 (94.6)
## Yes 276 ( 5.4)
## residencetype (%) Rural 2513 (49.2)
## Urban 2596 (50.8)
## worktype (%) children 687 (13.4)
## Govt_job 657 (12.9)
## Never_worked 22 ( 0.4)
## Private 2924 (57.2)
## Self-employed 819 (16.0)
## smokingstatus (%) never smoked 2844 (55.7)
## formerly smoked 1195 (23.4)
## smokes 1070 (20.9)
## gender (%) Male 2115 (41.4)
## Female 2994 (58.6)
# complete_data is presumably df2 after imputation / removal of NAs -- TODO confirm
cor_matrix2 <- cor(complete_data[, c("age", "bmi", "avgglucoselevel")]) # numeric predictors only
corrplot(cor_matrix2, method="color", tl.col="black", tl.srt=45)
GGally::ggpairs(complete_data[, c("age", "bmi", "avgglucoselevel")]) # note the violations in assumptions of pearson correlation (variables arent normal)
# Grouped boxplots: distribution of each numeric predictor by stroke status
par(mfrow=c(1,3))
boxplot(complete_data$age ~ complete_data$stroke, col = c("#1f77b4", "#ff7f0e"), main = "Boxplot of Age by Stroke Status")
boxplot(complete_data$bmi ~ complete_data$stroke, col = c("#1f77b4", "#ff7f0e"), main = "Boxplot of BMI by Stroke Status")
boxplot(complete_data$avgglucoselevel ~ complete_data$stroke, col = c("#1f77b4", "#ff7f0e"), main = "Boxplot of Avg Glucose Level by Stroke Status")
t.test(complete_data$age ~ complete_data$stroke) # Reject H0, Statistically significant
##
## Welch Two Sample t-test
##
## data: complete_data$age by complete_data$stroke
## t = -29.678, df = 331.61, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -27.45537 -24.04196
## sample estimates:
## mean in group No mean in group Yes
## 41.97953 67.72819
t.test(complete_data$bmi ~ complete_data$stroke) # Reject H0, Statistically significant
##
## Welch Two Sample t-test
##
## data: complete_data$bmi by complete_data$stroke
## t = -3.6374, df = 237.84, p-value = 0.0003377
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -2.5387991 -0.7549231
## sample estimates:
## mean in group No mean in group Yes
## 28.82443 30.47129
t.test(complete_data$avgglucoselevel ~ complete_data$stroke) # Reject H0, Statistically significant
##
## Welch Two Sample t-test
##
## data: complete_data$avgglucoselevel by complete_data$stroke
## t = -6.9844, df = 260.9, p-value = 2.373e-11
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -35.58269 -19.93162
## sample estimates:
## mean in group No mean in group Yes
## 104.7876 132.5447
# Stroke x ever married
with(complete_data, CrossTable(evermarried, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, Statistically significant
# Association between stroke and ever married
# Stroke x Hypertension
with(complete_data, CrossTable(hypertension, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, Statistically significant
# Association between stroke and hypertension
# Stroke x heart disease
with(complete_data, CrossTable(heartdisease, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, Statistically significant
# Association between stroke and heart disease
# Stroke associated with work type
with(complete_data, CrossTable(worktype, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, Statistically significant
# Association between stroke and work type
# Expected value for never worked and has stroke is under 5, chi square of independence assumption IS VIOLATED
# Stroke associated with smoking status?
with(complete_data, CrossTable(smokingstatus, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, Statistically significant
# Association between stroke and smoking status
# Two-sample t-tests comparing each numeric variable across binary groups;
# significant mean differences flag candidate predictors and interactions.
# NOTE(review): var.equal = TRUE uses the pooled-variance t-test, which
# assumes equal group variances; Welch's t-test (R's default) would be
# safer unless that assumption has been checked — confirm.
t.test(complete_data$age ~ complete_data$gender, var.equal = TRUE) # p = 0.05, reject H0 - statistically significant, but age:gender may be useless in models
t.test(complete_data$age ~ complete_data$hypertension, var.equal = TRUE) # Reject H0, statistically significant
t.test(complete_data$age ~ complete_data$heartdisease, var.equal = TRUE) # Reject H0, statistically significant
t.test(complete_data$age ~ complete_data$evermarried, var.equal = TRUE) # Reject H0, statistically significant
t.test(complete_data$bmi ~ complete_data$hypertension, var.equal = TRUE) # Reject H0, statistically significant
t.test(complete_data$bmi ~ complete_data$heartdisease, var.equal = TRUE) # Reject H0, statistically significant
t.test(complete_data$bmi ~ complete_data$evermarried, var.equal = TRUE) # Reject H0, statistically significant
t.test(complete_data$avgglucoselevel ~ complete_data$gender, var.equal = TRUE) # Reject H0, statistically significant
t.test(complete_data$avgglucoselevel ~ complete_data$hypertension, var.equal = TRUE) # Reject H0, statistically significant
t.test(complete_data$avgglucoselevel ~ complete_data$heartdisease, var.equal = TRUE) # Reject H0, statistically significant
t.test(complete_data$avgglucoselevel ~ complete_data$evermarried, var.equal = TRUE) # Reject H0, statistically significant
# Chi-square tests of independence between pairs of categorical PREDICTORS
# (not the outcome): significant pairwise associations signal potential
# multicollinearity and motivate the interaction terms examined later.
with(complete_data, CrossTable(worktype, gender, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant
with(complete_data, CrossTable(evermarried, hypertension, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant
with(complete_data, CrossTable(hypertension, heartdisease, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant -> verifies what we suspected from ggpairs
with(complete_data, CrossTable(smokingstatus, heartdisease, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant
with(complete_data, CrossTable(hypertension, worktype, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant -> CAUTION: some expected cell counts are below 5
with(complete_data, CrossTable(worktype, evermarried, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant
with(complete_data, CrossTable(residencetype, smokingstatus, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant
with(complete_data, CrossTable(heartdisease, worktype, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant
with(complete_data, CrossTable(evermarried, heartdisease, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant
with(complete_data, CrossTable(heartdisease, gender, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant
with(complete_data, CrossTable(smokingstatus, gender, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant
with(complete_data, CrossTable(worktype, smokingstatus, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant
with(complete_data, CrossTable(gender, evermarried, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Reject H0, statistically significant
Exploratory analysis of the Stroke data set revealed several important patterns and potential data quality issues.
BMI has missing data, and the “Unknown” level in
smokingstatus is treated as missing. Because BMI and
Age are numerical variables, predictive mean matching
imputation is used, as it is a common method that is robust for numerical
variables. Random Forest imputation is used for smokingstatus
because the Stroke data set contains a mix of categorical and numerical
variables; this approach prioritizes accuracy and accommodates more complex
relationships. For gender, to ensure consistent levels, the
“Other” category was removed as it contains only one observation.
The data set contains more non-stroke cases
(stroke = 0) than stroke cases (stroke = 1),
which can impact model performance.
Histograms indicate extreme skewness in
avgglucoselevel and BMI, which may need to be
log transformed, as model performance can otherwise
suffer.
Transforming (e.g., log transforming or standardizing) age,
avgglucoselevel, and BMI for LDA/QDA or
regularized logistic regression would improve model performance.
Those who have a stroke are typically older and generally
fall into two clusters of avgglucoselevel.
The average bmi of those who have a
stroke is 30, as opposed to 28 for those who do not have a
stroke, whereas the average age of those who
have a stroke is 67, as opposed to 41 for those who do not. The
avgglucoselevel of those who have a stroke is
132, as opposed to 104 for those who do not.
Chi-squared test indicated there is no statistically significant
association between stroke with gender or
residencetype, but there is a statistically significant
association between stroke and evermarried,
hypertension, heartdisease,
worktype, and smokingstatus.
Grouped scatter plots and multi-collinearity statistical tests indicate that various interaction terms will need to be considered when modelling as basic logistic regression models would suffer from high multicollinearity. The following interactions require further analysis as to their significance:
age:hypertension, age:heartdisease,
age:evermarried,
age:averageglucoselevel
bmi:hypertension, bmi:evermarried,
bmi:heartdisease
avgglucoselevel:hypertension,
avgglucoselevel:heartdisease,
avgglucoselevel:evermarried
worktype:evermarried,
worktype:heartdisease, worktype:hypertension,
worktype:smokingstatus
smokingstatus:evermarried,
smokingstatus:heartdisease
hypertension:evermarried,
hypertension:heartdisease
This section applies Logistic Regression,
Linear Discriminant Analysis(LDA) , Quadratic
Discriminant Analysis(QDA) and Random Forest
to predict whether a patient has stroke.
# For plausible interactions, test each one individually with a simple
# logistic regression; stroke ~ a*b fits both main effects plus the a:b
# interaction, and the interaction's Wald p-value is read from summary().
# Note: the following are the interactions that were significant; for
# interactions that were tested and were not significant, see the appendix.
fit_int2 <- glm(stroke ~ age*heartdisease, data=complete_data, family = "binomial")
summary(fit_int2)
# age:heartdisease statistically significant
fit_int5 <- glm(stroke ~ bmi*hypertension, data=complete_data, family = "binomial")
summary(fit_int5)
# bmi:hypertension statistically significant
fit_int7 <- glm(stroke ~ bmi*evermarried, data=complete_data, family = "binomial")
summary(fit_int7)
# bmi:evermarried statistically significant; interaction more significant than the bmi main effect
fit_int8 <- glm(stroke ~ avgglucoselevel*hypertension, data=complete_data, family = "binomial")
summary(fit_int8)
# avgglucoselevel:hypertension statistically significant
fit_int11 <- glm(stroke ~ worktype*evermarried, data=complete_data, family = "binomial")
summary(fit_int11)
# statistically significant for worktypePrivate:evermarriedYes (but not for the other worktype levels)
fit_int16 <- glm(stroke ~ smokingstatus*heartdisease, data=complete_data, family = "binomial")
summary(fit_int16)
# smokingstatus:heartdisease statistically significant (at the 0.05 threshold)
fit_int17 <- glm(stroke ~ hypertension*evermarried, data=complete_data, family = "binomial")
summary(fit_int17)
# hypertension:evermarried statistically significant
fit_int18 <- glm(stroke ~ hypertension*heartdisease, data=complete_data, family = "binomial")
summary(fit_int18)
# hypertension:heartdisease statistically significant
age:heartdisease,
bmi:hypertension, bmi:evermarried,
avgglucoselevel:hypertension,
smokingstatus:heartdisease,
hypertension:evermarried,
hypertension:heartdisease
##
## Call:
## glm(formula = stroke ~ age + hypertension + heartdisease + avgglucoselevel +
## smokingstatus + evermarried + age:heartdisease + hypertension:evermarried +
## hypertension:heartdisease, family = "binomial", data = complete_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.207324 0.485267 -16.913 < 2e-16 ***
## age 0.070237 0.006192 11.343 < 2e-16 ***
## hypertension1 1.737507 0.496452 3.500 0.000466 ***
## heartdiseaseYes 2.633869 1.401689 1.879 0.060235 .
## avgglucoselevel 0.004925 0.001260 3.908 9.29e-05 ***
## smokingstatusformerly smoked 0.117366 0.172033 0.682 0.495093
## smokingstatussmokes 0.421086 0.197419 2.133 0.032928 *
## evermarriedYes 0.230015 0.308296 0.746 0.455616
## age:heartdiseaseYes -0.029095 0.019625 -1.483 0.138196
## hypertension1:evermarriedYes -1.192216 0.517820 -2.302 0.021314 *
## hypertension1:heartdiseaseYes -0.671801 0.444982 -1.510 0.131113
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1728.3 on 4907 degrees of freedom
## Residual deviance: 1360.7 on 4897 degrees of freedom
## AIC: 1382.7
##
## Number of Fisher Scoring iterations: 8
## age hypertension1
## 95.65685 100.94144
## heartdiseaseYes avgglucoselevel
## 453.79200 15.37840
## smokingstatusformerly smoked smokingstatussmokes
## 25.66308 31.21740
## evermarriedYes age:heartdiseaseYes
## 105.72878 427.46181
## hypertension1:evermarriedYes hypertension1:heartdiseaseYes
## 99.41058 11.34878
age, hypertension,
heartdisease, avgglucoselevel,
smokingstatus, evermarried, and
bmi.
Interaction terms age:heartdisease,
smokingstatus:heartdisease, and
hypertension:evermarried were also selected.
Interactions that were not statistically significant were omitted.
## (Intercept)
## -8.161300874
## age
## 0.070380992
## hypertension1
## 1.591438888
## heartdiseaseYes
## 2.202423520
## avgglucoselevel
## 0.004988396
## `smokingstatusformerly smoked`
## 0.070818750
## smokingstatussmokes
## 0.310328425
## evermarriedYes
## 0.238959131
## `age:heartdiseaseYes`
## -0.028988471
## `heartdiseaseYes:smokingstatusformerly smoked`
## 0.266659370
## `heartdiseaseYes:smokingstatussmokes`
## 0.513755102
## `hypertension1:evermarriedYes`
## -1.181582964
A one year increase in age increases the log odds
of having a stroke by 0.07132, ceteris
paribus.
For a one mg/dl increase in avgglucoselevel, the log
odds of having a stroke increase by 0.00425, ceteris
paribus.
For those with hypertension, the log odds of having
a stroke are 1.34583 units higher than for those without
hypertension, ceteris paribus.
For those with heartdisease, the log odds of having
a stroke are 2.08845 units higher than for those without heart
disease, ceteris paribus.
For those who formerly smoked, the log odds of having a
stroke are 0.12441 units higher than for those who never smoked,
ceteris paribus.
For those who smoke, the log odds of having a stroke
are 0.10365 units higher than for those who never smoked, ceteris
paribus.
For those who have ever married, the log odds of
having a stroke are 0.07084 units higher than for those who have
never been married, ceteris paribus.
## Area under the curve: 0.8434
## Best lamda (smallest cv-error): 0.002177595
## 23 x 1 sparse Matrix of class "dgCMatrix"
## unstandardized_coef
## (Intercept) -7.330086028
## genderFemale .
## age 0.063732267
## hypertension1 0.544887569
## heartdiseaseYes 0.194898669
## evermarriedYes .
## worktypeGovt_job .
## worktypeNever_worked .
## worktypePrivate 0.050503883
## worktypeSelf-employed -0.194880821
## residencetypeUrban .
## avgglucoselevel 0.004392215
## bmi .
## smokingstatusformerly smoked .
## smokingstatussmokes 0.049396449
## age:heartdiseaseYes .
## hypertension1:bmi .
## evermarriedYes:bmi .
## hypertension1:avgglucoselevel .
## heartdiseaseYes:smokingstatusformerly smoked 0.199882587
## heartdiseaseYes:smokingstatussmokes 0.668436163
## hypertension1:evermarriedYes .
## hypertension1:heartdiseaseYes -0.259171998
For a one year increase in age, the log odds of
having a stroke increases by 0.064, ceteris
paribus.
For a one mg/dl increase in avgglucoselevel, the log
odds of having a stroke increase by 0.00357, ceteris
paribus.
For those with hypertension, the log odds of having
a stroke is 0.545 units higher compared to those without
hypertension, ceteris paribus.
For those with heartdisease, the log odds of having
a stroke is 0.195 units higher compared to those without
heart disease, ceteris paribus.
For those who have heartdisease and
formerly smoked, the log odds of having stroke
is 0.200 units higher compared to if only the individual effects of
heartdisease and never smoked were added
alone, ceteris paribus.
For those who have heartdisease and smoke, the
additional effect on the log odds of having a stroke is 0.668
units higher than would be obtained by adding only the individual effects of
heart disease and never smoked,
ceteris paribus.
For individuals with a worktype of
Self-employed, the log odds of having a stroke
is 0.195 units lower compared to those with a worktype of
‘Children’, ceteris paribus.
For individuals with a worktype of Private, the
log odds of having a stroke is 0.051 units higher compared to
those with a worktype of ‘Children’,
ceteris paribus.
## Area under the curve: 0.8528
## Linear Discriminant Analysis
##
## 4908 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4417, 4417, 4417, 4417, 4417, 4417, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8329498 0.9778646 0.09595238
## Quadratic Discriminant Analysis
##
## 4908 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4417, 4417, 4417, 4419, 4417, 4417, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8192788 0.9091335 0.387381
## QDA AUC: 0.8161092
# Tune the random forest's mtry via the out-of-bag (OOB) error: for each
# candidate mtry, fit a 500-tree forest (stratified on stroke to balance
# class splits) and record the mean OOB error across trees.
# NOTE(review): no set.seed() precedes this loop, so the selected mtry is
# not reproducible across runs; seeding here would change downstream RNG
# state, so it is flagged rather than changed.
mtry_grid <- c(2, 3, 4, 5)
oob_results <- vapply(mtry_grid, function(m) {
  model <- randomForest(
    formula,
    data = complete_data,
    mtry = m,
    strata = complete_data$stroke, # Balance class splits
    ntree = 500
  )
  mean(model$err.rate[, "OOB"]) # Average OOB error over the 500 trees
}, numeric(1)) # vapply: type-stable replacement for sapply
# Candidate with the lowest average OOB error
best_mtry <- mtry_grid[which.min(oob_results)]
# Fit the final random forest with the tuned mtry. Class imbalance is
# handled via stratified, balanced bootstrap samples: sampsize draws the
# same number of cases from each stroke class (the minority class size).
set.seed(123) # Make the forest reproducible
rf_model <- randomForest(
formula,
data = complete_data,
mtry = best_mtry, # Optimal mtry from earlier
ntree = 1000, # Sufficiently large to observe stabilization
strata = complete_data$stroke, # For class imbalance
sampsize = rep(min(table(complete_data$stroke)), 2), # Balanced samples
importance = TRUE
)
# Trace the per-tree OOB error for the "Yes" (stroke) class; once the
# curve flattens out, the forest has stabilized and additional trees no
# longer improve the fit (i.e. it is not overfitting).
stroke_oob <- rf_model$err.rate[, "Yes"]
min_oob <- min(stroke_oob)
plot(stroke_oob, ylab = "OOB Error (Yes)", col = "red", type = "l")
# Dashed reference line at the minimum OOB error observed
abline(lty = 2, col = "blue", h = min_oob)
cat("Minimum OOB error:", min_oob, "\n")
## Minimum OOB error: 0.1866029
# Refit the random forest through caret with cross-validation (trControl =
# ctrl, defined earlier) so its ROC/AUC is directly comparable to the
# other classifiers; mtry is held fixed at the value tuned above.
rf_model <- train(
formula,
data = complete_data,
method = "rf",
trControl = ctrl,
metric = "ROC",
tuneGrid = data.frame(mtry = best_mtry),
ntree = 200 # Number of trees
)
rf_model
## Random Forest
##
## 4908 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4417, 4417, 4417, 4418, 4418, 4417, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7541633 1 0
##
## Tuning parameter 'mtry' was held constant at a value of 2
# Build the cross-validated ROC for the random forest from caret's
# held-out predictions (rf_model$pred).
cv_roc_rf <- roc(
response = rf_model$pred$obs,
predictor = rf_model$pred$Yes, # Predicted probability of the "Yes" class
levels = c("No", "Yes") # Must match the factor levels of the outcome
)
# Extract the underlying randomForest object fitted on the full data
final_rf <- rf_model$finalModel
# Plot variable importance
varImpPlot(final_rf)
# Overlay the cross-validated ROC curves of all five classifiers on one set
# of axes for visual comparison.
par(mfrow=c(1,1))
plot(cv_roc_log, col="red", main="ROC Curve Comparison")
lines(cv_roc_lasso, col="orange")
lines(cv_roc_lda, col="blue")
lines(cv_roc_qda, col="green")
lines(cv_roc_rf, col="purple")
# BUG FIX: the legend previously listed five models but supplied only four
# colours (missing "orange"), so R recycled the colour vector and the key
# was misaligned with the plotted curves.
legend("bottomright", legend=c("Logistic", "LASSO", "LDA", "QDA", "Random Forest"), col=c("red", "orange", "blue", "green", "purple"), lwd=2)
# Gather the cross-validated AUC of each classifier into a single named
# vector so the models can be ranked side by side.
auc_values <- c(
  Fwd_Step_Logit = auc(cv_roc_log),
  LASSO = auc(cv_roc_lasso),
  LDA = auc(cv_roc_lda),
  QDA = auc(cv_roc_qda),
  `Random Forest` = auc(cv_roc_rf)
)
auc_values
## Fwd_Step_Logit LASSO LDA QDA Random Forest
## 0.8433760 0.8527570 0.8311256 0.8161092 0.7533915
# Identify and report the classifier with the highest cross-validated AUC
best_model <- names(auc_values)[which.max(auc_values)]
cat("Best Model(Highest-AUC):", best_model, "\n")
## Best Model(Highest-AUC): LASSO
# All perform similarly, feature engineering may be needed to improve classification
LASSO had the highest ROC curve (largest AUC) : Suggests that identifying a small subset of features is beneficial for classification.
A sparse linear boundary was enough for classification.
L1 regularization effectively handles multicollinearity and can help reduce over-fitting and as it performs feature selection by shrinking some coefficients to zero.
Stepwise Logistic Regression had a similar ROC curve to LASSO: A simple linear boundary is sufficient for classification, meaning the stroke data is linearly separable.
LDA does not perform better than Logistic Regression: although the assumption of normally distributed data holds well, the assumption of homogeneity of covariance matrices does not. LDA is therefore not the most suitable classifier.
QDA outperforms neither LDA nor Logistic Regression: this suggests that the relationship between the features and stroke is not non-linear, so QDA’s ability to model curved decision boundaries is not advantageous.
Random Forest has the lowest AUC: this suggests the model underperforms due to its inability to tackle multicollinearity. The model is less effective because it does not penalize redundant features and is sensitive to noise. However, it is still useful for assessing feature importance.
All models perform similarly: Feature engineering or additional predictors might be needed to improve classification.
Key Takeaways: Although Random forest suggested the
most important predictors of stroke are age,
avgglucoselevel and bmi, LASSO favors
heartdisease and hypertension predictors. The
best model depends on AUC, ROC-curve, how the model performs
feature selection and meets underlying assumptions.
Handling class imbalance can improve performance.
Pooling data across all imputed data sets would have
provided better generalization and more robust standard
error estimates.
# APPENDIX: association tests that were NOT statistically significant.
# Stroke x Gender
with(complete_data, CrossTable(gender, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, not statistically significant:
# no association between stroke and gender
# Stroke x residence type
with(complete_data, CrossTable(residencetype, stroke, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, not statistically significant:
# no association between stroke and residence type
# T-tests with no significant mean difference between groups
t.test(complete_data$age ~ complete_data$residencetype, var.equal = TRUE) # p = 0.3, fail to reject H0
t.test(complete_data$bmi ~ complete_data$gender, var.equal = TRUE) # p = 0.1, fail to reject H0
t.test(complete_data$bmi ~ complete_data$residencetype, var.equal = TRUE) # p = 1, fail to reject H0
t.test(complete_data$avgglucoselevel ~ complete_data$residencetype, var.equal = TRUE) # Fail to reject H0, p = 0.7
# Chi-square tests between predictor pairs with no significant association
with(complete_data, CrossTable(hypertension, smokingstatus, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, not statistically significant
with(complete_data, CrossTable(gender, residencetype, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, not statistically significant
with(complete_data, CrossTable(residencetype, evermarried, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, not statistically significant
with(complete_data, CrossTable(gender, hypertension, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, not statistically significant
with(complete_data, CrossTable(residencetype, hypertension, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, not statistically significant
with(complete_data, CrossTable(residencetype, worktype, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, not statistically significant
with(complete_data, CrossTable(residencetype, heartdisease, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, not statistically significant
with(complete_data, CrossTable(smokingstatus, hypertension, prop.chisq = FALSE, prop.c = FALSE, prop.t = FALSE, asresid = TRUE, expected = T, format = c("SPSS")))
# Fail to reject H0, not statistically significant
# APPENDIX: two-way interactions that were tested and found NOT
# statistically significant (the significant ones appear in the report
# body). Each model fits both main effects and the interaction; the
# interaction's p-value is read from summary().
fit_int <- glm(stroke ~ age*hypertension, data=complete_data, family = "binomial")
summary(fit_int)
# age:hypertension - not significant
fit_int3 <- glm(stroke ~ age*evermarried, data=complete_data, family = "binomial")
summary(fit_int3)
# age:evermarried - not significant
fit_int4 <- glm(stroke ~ age*avgglucoselevel, data=complete_data, family = "binomial")
summary(fit_int4)
# age:avgglucoselevel - not significant
fit_int6 <- glm(stroke ~ bmi*heartdisease, data=complete_data, family = "binomial")
summary(fit_int6)
# bmi:heartdisease - not significant
fit_int9 <- glm(stroke ~ avgglucoselevel*heartdisease, data=complete_data, family = "binomial")
summary(fit_int9)
# avgglucoselevel:heartdisease - not significant
fit_int10 <- glm(stroke ~ avgglucoselevel*evermarried, data=complete_data, family = "binomial")
summary(fit_int10)
# avgglucoselevel:evermarried - not significant
fit_int12 <- glm(stroke ~ worktype*heartdisease, data=complete_data, family = "binomial")
summary(fit_int12)
# worktype:heartdisease - not significant
fit_int13 <- glm(stroke ~ worktype*hypertension, data=complete_data, family = "binomial")
summary(fit_int13)
# worktype:hypertension - not significant
fit_int14 <- glm(stroke ~ worktype*smokingstatus, data=complete_data, family = "binomial")
summary(fit_int14)
# worktype:smokingstatus - not significant
fit_int15 <- glm(stroke ~ smokingstatus*evermarried, data=complete_data, family = "binomial")
summary(fit_int15)
# smokingstatus:evermarried - not significant
# Consider three-way interactions too, built from the two-way interactions
# that tested significant above, e.g.:
# age:heartdisease:hypertension
# bmi:hypertension:evermarried
# avgglucoselevel:hypertension:heartdisease
# bmi:evermarried:hypertension
# age:heartdisease:smokingstatus
# Each a*b*c model fits all main effects plus all two- and three-way terms.
fit_int19 <- glm(stroke ~ age*heartdisease*hypertension, data=complete_data, family = "binomial")
summary(fit_int19)
fit_int20<- glm(stroke ~ bmi*hypertension*evermarried, data=complete_data, family = "binomial")
summary(fit_int20)
fit_int21 <- glm(stroke ~ avgglucoselevel*hypertension*heartdisease, data=complete_data, family = "binomial")
summary(fit_int21)
fit_int22 <- glm(stroke ~ bmi*evermarried*hypertension, data=complete_data, family = "binomial")
summary(fit_int22)
fit_int23 <- glm(stroke ~ age*heartdisease*smokingstatus, data=complete_data, family = "binomial")
summary(fit_int23)
# All three-way interactions are statistically insignificant; none are carried forward